#uncomment the following line if you haven't installed these packages yet
#install.packages(c('vegan', 'cluster', 'rioja', 'gplots', 'adjclust))
library('vegan')
library('cluster')
library('rioja')
library('gplots')
library('adjclust')
# also uses stats package, included with base R.
We can use the Samwell Cave and pollen datasets from yesterday:
load(file="data/scpdFauna_cleaned.RData") #object is scpdSM
pollen12K <- read.csv(file="data/12000.interp.data_clean_sites.csv")
We will apply three different distance measures to our data: euclidean, bray-curtis, and chao.
scpdSM_euc <- vegdist(scpdSM, method="euclidean")
scpdSM_bray <- vegdist(scpdSM, method="bray")
scpdSM_chao <- vegdist(scpdSM, method="chao")
# print(scpdSM_euc)
# print(scpdSM_bray)
# print(scpdSM_chao)
?hclust
clust_euc_ward.D2 <- hclust(scpdSM_euc, method="ward.D2")
clust_euc_neighbor <- hclust(scpdSM_euc, method="single")
clust_euc_upgma <- hclust(scpdSM_euc, method="average") #UPGMA
clust_euc_wpgma <- hclust(scpdSM_euc, method="mcquitty") #WPGMA
clust_bray_ward.D2 <- hclust(scpdSM_bray, method="ward.D2")
clust_bray_neighbor <- hclust(scpdSM_bray, method="single")
clust_bray_upgma <- hclust(scpdSM_bray, method="average") #UPGMA
clust_bray_wpgma <- hclust(scpdSM_bray, method="mcquitty") #WPGMA
clust_chao_ward.D2 <- hclust(scpdSM_chao, method="ward.D2")
clust_chao_neighbor <- hclust(scpdSM_chao, method="single")
clust_chao_upgma <- hclust(scpdSM_chao, method="average") #UPGMA
clust_chao_wpgma <- hclust(scpdSM_chao, method="mcquitty") #WPGMA
plotting_objs <- ls()[grep("clust", ls())]
print(plotting_objs)
[1] "clust_bray_neighbor" "clust_bray_upgma" "clust_bray_ward.D2" "clust_bray_wpgma"
[5] "clust_chao_neighbor" "clust_chao_upgma" "clust_chao_ward.D2" "clust_chao_wpgma"
[9] "clust_euc_neighbor" "clust_euc_upgma" "clust_euc_ward.D2" "clust_euc_wpgma"
# Note: I'm writing the cluster diagram to a pdf file, so it's easier to read. You can uncomment the pdf() and dev.off() lines to print to the screen, or just not copy those lines
pdf(file="output/cluster_diagrams.pdf", height=11, width=8.5)
par(mfcol=c(4,3), mar=c(2,4,1,1)+.01)
for (i in 1:length(plotting_objs)){
plot(get(plotting_objs[i]), cex=0.75, xlab=NA, sub=NA, main=plotting_objs[i], hang=-.1)
}
dev.off()
null device
1
dissimColors <- gray.colors(30, start = 0.9, end = 0.1, gamma = 2.2, alpha = NULL)
ages <- as.numeric(rownames(scpdSM))
ageColors <- rainbow(length(ages), start=0, end=4/6)
heatmap(scpdSM, col=dissimColors, distfun=function(m) vegdist(m,method="bray"), hclustfun=function(m) hclust(m, method="average"), RowSideColors = ageColors)
Note 1: I’ve added some colors to the age labels. If adjacent strata clustered together, the colors would be ordered in a gradient. Note 2: in this case the dendrogram for the taxon names is meaningless, so let’s suppress it. But we can’t do this with the base heatmap function. We need to use a function in a different package: gplots
heatmap.2(scpdSM, dendrogram='row', col=dissimColors, distfun=function(m) vegdist(m,method="bray"), hclustfun=function(m) hclust(m, method="average"), RowSideColors = ageColors, tracecol=NA)
See the reference paper in the refs folder (Grimm 1987) for more info on the CONISS implementation. Note, here we have compared the data to a broken-stick distribution to estimate the minimum number of significant clusters. A broken-stick distribution is simple to calculate and the expected values of the pieces of the broken stick are given by Equation 1: Ej = 1/n * (sum(from x=j to n) 1/x) Ej is the expected value of the jth piece, and n is the number of pieces (breaks in this case)
constrained <- chclust(scpdSM_bray, method="coniss")
bstick <- bstick(constrained, 10)
ngroups <- min(which(bstick$dispersion>bstick$bstick)) # in this case, no clusters should be retained because even with two groups, the broken stick values are higher than the observed Sum of Squares.
x <- strat.plot(scpdSM, yvar=ages, title="", y.rev=T, cex.xlabel=0.5,cex.yaxis=0.5, cex.axis=0.5, clust=constrained)
addClustZone(x, constrained, col="red", ngroups)
Add this constrained clustering dendrogram to the heatmap instead of the default hclust dendro. Note that now the colors/ages are ordered.
consDendro <- as.dendrogram(constrained)
heatmap.2(scpdSM, Rowv= consDendro, col=dissimColors, distfun=function(m) vegdist(m,method="bray"), RowSideColors = ageColors, tracecol=NA, dendrogram="row")
I ran across this package for constrained clustering: adjclust. See this website https://github.com/rstats-gsoc/gsoc2017/wiki/Constrained-Hierarchical-Agglomerative-Clustering for an in-depth discussion
fit1 <- adjClust(scpdSM_bray, type="dissimilarity")
Note: input class is 'dist' so 'type' is supposed to be 'dissimilarity'
Note: 1 merges with non increasing heights.
plot(fit1, leaflab="perpendicular")
Detected reversals in dendrogram: mode = 'corrected', 'within-disp' or 'total-disp' might be more relevant.